Project Intro

wait

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

Data Preprocessing

StateOfInterest = c("Arizona", "California", "Minnesota", "New Mexico", "New York", 
                   "Oklahoma", "South Carolina", "Tennessee", "Utah", "Virginia", 
                   "West Virginia", "Wisconsin")

Demographic & Policy Data (County-Level)

library(readr)
county_data_abridged = read_csv("county_data_abridged.csv")
dim(county_data_abridged)
## [1] 3244   87
names(county_data_abridged)
##  [1] "countyFIPS"                        "STATEFP"                          
##  [3] "COUNTYFP"                          "CountyName"                       
##  [5] "StateName"                         "State"                            
##  [7] "lat"                               "lon"                              
##  [9] "POP_LATITUDE"                      "POP_LONGITUDE"                    
## [11] "CensusRegionName"                  "CensusDivisionName"               
## [13] "Rural-UrbanContinuumCode2013"      "PopulationEstimate2018"           
## [15] "PopTotalMale2017"                  "PopTotalFemale2017"               
## [17] "FracMale2017"                      "PopulationEstimate65+2017"        
## [19] "PopulationDensityperSqMile2010"    "CensusPopulation2010"             
## [21] "MedianAge2010"                     "#EligibleforMedicare2018"         
## [23] "MedicareEnrollment,AgedTot2017"    "3-YrDiabetes2015-17"              
## [25] "DiabetesPercentage"                "HeartDiseaseMortality"            
## [27] "StrokeMortality"                   "Smokers_Percentage"               
## [29] "RespMortalityRate2014"             "#FTEHospitalTotal2017"            
## [31] "TotalM.D.'s,TotNon-FedandFed2017"  "#HospParticipatinginNetwork2017"  
## [33] "#Hospitals"                        "#ICU_beds"                        
## [35] "dem_to_rep_ratio"                  "PopMale<52010"                    
## [37] "PopFmle<52010"                     "PopMale5-92010"                   
## [39] "PopFmle5-92010"                    "PopMale10-142010"                 
## [41] "PopFmle10-142010"                  "PopMale15-192010"                 
## [43] "PopFmle15-192010"                  "PopMale20-242010"                 
## [45] "PopFmle20-242010"                  "PopMale25-292010"                 
## [47] "PopFmle25-292010"                  "PopMale30-342010"                 
## [49] "PopFmle30-342010"                  "PopMale35-442010"                 
## [51] "PopFmle35-442010"                  "PopMale45-542010"                 
## [53] "PopFmle45-542010"                  "PopMale55-592010"                 
## [55] "PopFmle55-592010"                  "PopMale60-642010"                 
## [57] "PopFmle60-642010"                  "PopMale65-742010"                 
## [59] "PopFmle65-742010"                  "PopMale75-842010"                 
## [61] "PopFmle75-842010"                  "PopMale>842010"                   
## [63] "PopFmle>842010"                    "3-YrMortalityAge<1Year2015-17"    
## [65] "3-YrMortalityAge1-4Years2015-17"   "3-YrMortalityAge5-14Years2015-17" 
## [67] "3-YrMortalityAge15-24Years2015-17" "3-YrMortalityAge25-34Years2015-17"
## [69] "3-YrMortalityAge35-44Years2015-17" "3-YrMortalityAge45-54Years2015-17"
## [71] "3-YrMortalityAge55-64Years2015-17" "3-YrMortalityAge65-74Years2015-17"
## [73] "3-YrMortalityAge75-84Years2015-17" "3-YrMortalityAge85+Years2015-17"  
## [75] "mortality2015-17Estimated"         "stay at home"                     
## [77] ">50 gatherings"                    ">500 gatherings"                  
## [79] "public schools"                    "restaurant dine-in"               
## [81] "entertainment/gym"                 "federal guidelines"               
## [83] "foreign travel ban"                "SVIPercentile"                    
## [85] "HPSAShortage"                      "HPSAServedPop"                    
## [87] "HPSAUnderservedPop"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#EligibleforMedicare2018"] = "EligibleforMedicare2018"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#FTEHospitalTotal2017"] = "FTEHospitalTotal2017"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#HospParticipatinginNetwork2017"] = "HospParticipatinginNetwork2017"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#Hospitals"] = "Hospitals"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#ICU_beds"] = "ICU_beds"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "PopulationEstimate65+2017"] = "PopulationEstimate_above65_2017"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "stay at home"] = "stay_at_home"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == ">50 gatherings"] = "above_50_gatherings"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == ">500 gatherings"] = "above_500_gatherings"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "public schools"] = "public_schools"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "restaurant dine-in"] = "restaurant_dine_in"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "entertainment/gym"] = "entertainment_gym"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "federal guidelines"] = "federal_guidelines"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "foreign travel ban"] = "foreign_travel_ban"
data = subset(county_data_abridged,
              select = c(State, CountyName, POP_LATITUDE, POP_LONGITUDE,
                         PopulationEstimate2018, PopTotalMale2017, 
                         PopulationEstimate_above65_2017, PopulationDensityperSqMile2010, 
                         DiabetesPercentage, Smokers_Percentage, 
                         HeartDiseaseMortality, StrokeMortality, 
                         Hospitals, ICU_beds, HospParticipatinginNetwork2017, 
                         stay_at_home, above_50_gatherings, above_500_gatherings, 
                         restaurant_dine_in, entertainment_gym))
data = na.omit(data)
data = droplevels(data)

data$stay_at_home = data$stay_at_home - range(data$stay_at_home)[1]
data$above_50_gatherings = data$above_50_gatherings - range(data$above_50_gatherings)[1]
data$above_500_gatherings = data$above_500_gatherings - range(data$above_500_gatherings)[1]
data$restaurant_dine_in = data$restaurant_dine_in - range(data$restaurant_dine_in)[1]
data$entertainment_gym = data$entertainment_gym - range(data$entertainment_gym)[1]

str(data)
## tibble [2,585 × 20] (S3: tbl_df/tbl/data.frame)
##  $ State                          : chr [1:2585] "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ CountyName                     : chr [1:2585] "Autauga" "Baldwin" "Barbour" "Bibb" ...
##  $ POP_LATITUDE                   : num [1:2585] 32.5 30.5 31.8 33 34 ...
##  $ POP_LONGITUDE                  : num [1:2585] -86.5 -87.8 -85.3 -87.1 -86.6 ...
##  $ PopulationEstimate2018         : num [1:2585] 55601 218022 24881 22400 57840 ...
##  $ PopTotalMale2017               : num [1:2585] 27007 103225 13335 12138 28607 ...
##  $ PopulationEstimate_above65_2017: num [1:2585] 8392 42413 4757 3632 10351 ...
##  $ PopulationDensityperSqMile2010 : num [1:2585] 91.8 114.7 31 36.8 88.9 ...
##  $ DiabetesPercentage             : num [1:2585] 9.9 8.5 15.7 13.3 14.9 22.4 16.9 15.6 17.5 12.2 ...
##  $ Smokers_Percentage             : num [1:2585] 18.1 17.5 22 19.1 19.2 ...
##  $ HeartDiseaseMortality          : num [1:2585] 204 183 220 226 225 ...
##  $ StrokeMortality                : num [1:2585] 56.1 41.9 49 57.2 52.8 54.1 59 44 45.2 47.3 ...
##  $ Hospitals                      : num [1:2585] 1 3 1 1 1 1 1 2 0 1 ...
##  $ ICU_beds                       : num [1:2585] 6 51 5 0 6 0 7 24 0 0 ...
##  $ HospParticipatinginNetwork2017 : num [1:2585] 0 0 0 0 1 0 0 0 0 0 ...
##  $ stay_at_home                   : num [1:2585] 16 16 16 16 16 16 16 16 16 16 ...
##  $ above_50_gatherings            : num [1:2585] 5 5 5 5 5 5 5 5 5 5 ...
##  $ above_500_gatherings           : num [1:2585] 2 2 2 2 2 2 2 2 2 2 ...
##  $ restaurant_dine_in             : num [1:2585] 7 7 7 7 7 7 7 7 7 7 ...
##  $ entertainment_gym              : num [1:2585] 16 16 16 16 16 16 16 16 16 16 ...
##  - attr(*, "na.action")= 'omit' Named int [1:659] 68 69 70 71 72 73 74 75 76 77 ...
##   ..- attr(*, "names")= chr [1:659] "68" "69" "70" "71" ...
data_demographic_county = data

Infection Data (State-Level)

Covid-19 Cases

timeseries = read_csv("../datasets/timeseries.csv")
data = timeseries
data = subset(data, country == "United States" & level == "state")
data = subset(data, !(name %in% c("Unassigned cases, Arkansas, US", 
              "Unassigned cases, Georgia, US", "Unassigned cases, Illinois, US",
              "Unassigned cases, Iowa, US", "Unassigned cases, Maine, US",
              "Unassigned cases, Massachusetts, US", "Unassigned cases, North Dakota, US",
              "Washington, D.C., US")))
data$state = matrix(unlist(strsplit(as.character(data$name), ", ")), ncol = 2, byrow = TRUE)[, 1]
data = subset(data,
              select = c(state, date, cases, deaths, recovered))

data = subset(data, state %in% StateOfInterest)

data$state =     as.factor(data$state)
data$cases =     as.numeric(data$cases)
data$deaths =    as.numeric(data$deaths)
data$recovered = as.numeric(data$recovered)

data = na.omit(data)
data = droplevels(data)

str(data)
## tibble [1,578 × 5] (S3: tbl_df/tbl/data.frame)
##  $ state    : Factor w/ 12 levels "Arizona","California",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date     : Date[1:1578], format: "2020-04-14" "2020-04-15" ...
##  $ cases    : num [1:1578] 3806 3962 4234 4507 4719 ...
##  $ deaths   : num [1:1578] 131 142 150 169 177 184 187 208 229 249 ...
##  $ recovered: num [1:1578] 249 385 460 539 539 ...
##  - attr(*, "problems")= tibble [1,666,558 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:1666558] 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 ...
##   ..$ col     : chr [1:1666558] "state" "state" "state" "state" ...
##   ..$ expected: chr [1:1666558] "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
##   ..$ actual  : chr [1:1666558] "Burgenland" "Burgenland" "Burgenland" "Burgenland" ...
##   ..$ file    : chr [1:1666558] "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" ...
##  - attr(*, "na.action")= 'omit' Named int [1:894] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:894] "1" "2" "3" "4" ...
range(data$cases)
## [1]    237 613076
range(data$deaths)
## [1]     1 25250
range(data$recovered)
## [1]      2 266287
data_cases_state = data

SEIR Model Parameters

model_out = read_csv("../datasets/model_out.csv")
str(model_out)
## tibble [73 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ state    : chr [1:73] "Arizona" "Arizona" "Arizona" "Arizona" ...
##  $ startdate: Date[1:73], format: "2020-04-16" "2020-05-01" ...
##  $ enddate  : Date[1:73], format: "2020-04-30" "2020-05-15" ...
##  $ k        : num [1:73] 0.783 0.863 0.865 0.949 0.986 ...
##  $ sigma    : num [1:73] 0.0714 0.0714 0.0714 0.0714 0.0714 ...
##  $ lamda    : num [1:73] 0.0563 0.07 0.1266 0.0784 0.0925 ...
##  $ c        : num [1:73] 1 0 0.127 1 0 ...
##  $ alpha    : num [1:73] 0.06825 0.00721 0.06765 0.10224 0.05641 ...
##  $ omega    : num [1:73] 0.00275 0.003091 0.001429 0.001371 0.000698 ...
##  $ miu      : num [1:73] 0.0384 0.0175 0.0338 0.036 0.014 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   state = col_character(),
##   ..   startdate = col_date(format = ""),
##   ..   enddate = col_date(format = ""),
##   ..   k = col_double(),
##   ..   sigma = col_double(),
##   ..   lamda = col_double(),
##   ..   c = col_double(),
##   ..   alpha = col_double(),
##   ..   omega = col_double(),
##   ..   miu = col_double()
##   .. )
data_parm = model_out

Combined Data of Intersts (State-Level)

data = data_parm

n = length(data$state)
data$cases = rep(0, n)
data$deaths = rep(0, n)
data$recovered = rep(0, n)

for (i in 1:n){
  j = data_cases_state$state == data$state[i] & data_cases_state$date == data$startdate[i]
  if (sum(j) == 0){
    data[i, ] = NA
  }else{
    data[i, c("cases", "deaths", "recovered")] = data_cases_state[j, c("cases", "deaths", "recovered")]
  }
}

data = na.omit(data)

data_demographic_state = as.data.frame(matrix(nrow = length(data$state), ncol = length(colnames(data_demographic_county)) - 1))
colnames(data_demographic_state) = colnames(data_demographic_county)[colnames(data_demographic_county) != "CountyName"]
data_demographic_state$State = data$state

for (s in data_demographic_state$State){
  for (k in 2:ncol(data_demographic_state)){
    data_demographic_state[data_demographic_state$State == s, k] = 
      mean(unlist(data_demographic_county[data_demographic_county$State == s, k+1]))
  }
}

data = cbind(data, 
             subset(data_demographic_state[, colnames(data_demographic_state)[colnames(data_demographic_state) != "State"]]))
data = na.omit(data)
data$days = rep(0, nrow(data))
data$temp = rep(0, nrow(data))
for (i in 1:nrow(data)){
  data$temp = diff.Date(c(data[i, ]$startdate, data[i, ]$enddate))
}
for (s in data$state){
  for (d in 1:nrow(subset(data, state == s))){
    data[data$state == s, "days"][d] = sum(subset(data, state == s)[1:d, "temp"])
  }
}
data = data[, colnames(data)[colnames(data) != "temp"]]
data_unamed = data[, colnames(data)[!(colnames(data) %in% c("state", "startdate", "enddate"))]]
# head(data_demographic_county, 10)
# head(data_demographic_state, 10)
# head(data[, c("state", "startdate", "enddate", "days")], 10)
head(data, 10)
##         state  startdate    enddate      k   sigma   lamda       c   alpha
## 1     Arizona 2020-04-16 2020-04-30 0.7834 0.07143 0.05633 1.00000 0.06825
## 2     Arizona 2020-05-01 2020-05-15 0.8631 0.07143 0.07001 0.00000 0.00721
## 3     Arizona 2020-05-16 2020-05-30 0.8653 0.07143 0.12657 0.12709 0.06765
## 4     Arizona 2020-06-01 2020-06-15 0.9492 0.07143 0.07839 1.00000 0.10224
## 5     Arizona 2020-06-16 2020-06-30 0.9864 0.07143 0.09253 0.00000 0.05641
## 6     Arizona 2020-07-01 2020-07-15 0.9732 0.07143 0.06093 0.00000 0.00000
## 7  California 2020-04-16 2020-04-30 0.8574 0.07143 0.07211 0.07752 0.02728
## 8  California 2020-05-01 2020-05-15 0.9481 0.07143 0.02832 1.00000 0.01925
## 9  California 2020-05-16 2020-05-30 0.9598 0.07143 0.04972 0.40540 0.03497
## 10 California 2020-06-01 2020-06-15 1.1270 0.07143 0.10206 0.01176 0.03073
##        omega      miu  cases deaths recovered POP_LATITUDE POP_LONGITUDE
## 1  0.0027505 0.038429   4234    150       460        33.58        -111.5
## 2  0.0030913 0.017520   7962    330      1528        33.58        -111.5
## 3  0.0014293 0.033751  13631    679      3357        33.58        -111.5
## 4  0.0013712 0.036006  20123    917      4869        33.58        -111.5
## 5  0.0006979 0.013973  39097   1219      6598        33.58        -111.5
## 6  0.0000000 0.003805  84092   1720      9715        33.58        -111.5
## 7  0.0023628 0.007923  28035    970      1753        37.82        -120.9
## 8  0.0013106 0.006909  52152   2131      5130        37.82        -120.9
## 9  0.0009157 0.012128  78704   3207      9098        37.82        -120.9
## 10 0.0006310 0.012709 115032   4219     17585        37.82        -120.9
##    PopulationEstimate2018 PopTotalMale2017 PopulationEstimate_above65_2017
## 1                  478110           232553                           80116
## 2                  478110           232553                           80116
## 3                  478110           232553                           80116
## 4                  478110           232553                           80116
## 5                  478110           232553                           80116
## 6                  478110           232553                           80116
## 7                  682018           338751                           94920
## 8                  682018           338751                           94920
## 9                  682018           338751                           94920
## 10                 682018           338751                           94920
##    PopulationDensityperSqMile2010 DiabetesPercentage Smokers_Percentage
## 1                           52.05             10.060              16.48
## 2                           52.05             10.060              16.48
## 3                           52.05             10.060              16.48
## 4                           52.05             10.060              16.48
## 5                           52.05             10.060              16.48
## 6                           52.05             10.060              16.48
## 7                          663.26              8.505              12.09
## 8                          663.26              8.505              12.09
## 9                          663.26              8.505              12.09
## 10                         663.26              8.505              12.09
##    HeartDiseaseMortality StrokeMortality Hospitals ICU_beds
## 1                  148.8           30.90     5.067    103.9
## 2                  148.8           30.90     5.067    103.9
## 3                  148.8           30.90     5.067    103.9
## 4                  148.8           30.90     5.067    103.9
## 5                  148.8           30.90     5.067    103.9
## 6                  148.8           30.90     5.067    103.9
## 7                  153.9           37.89     5.672    126.5
## 8                  153.9           37.89     5.672    126.5
## 9                  153.9           37.89     5.672    126.5
## 10                 153.9           37.89     5.672    126.5
##    HospParticipatinginNetwork2017 stay_at_home above_50_gatherings
## 1                           1.800           12                   2
## 2                           1.800           12                   2
## 3                           1.800           12                   2
## 4                           1.800           12                   2
## 5                           1.800           12                   2
## 6                           1.800           12                   2
## 7                           1.845            0                   4
## 8                           1.845            0                   4
## 9                           1.845            0                   4
## 10                          1.845            0                   4
##    above_500_gatherings restaurant_dine_in entertainment_gym days
## 1                     6                  7                 7   14
## 2                     6                  7                 7   28
## 3                     6                  7                 7   42
## 4                     6                  7                 7   56
## 5                     6                  7                 7   70
## 6                     6                  7                 7   84
## 7                     8                  3                 3   14
## 8                     8                  3                 3   28
## 9                     8                  3                 3   42
## 10                    8                  3                 3   56

Modeling & Testing Procedures

Data Points pairs() Plot

set.seed(42)
num_obs = nrow(data_unamed) # total number of observations
num_trn = round(num_obs * 0.90) # number of observations for the training data

trn_idx = sample(num_obs, num_trn) # randomly generate the index for the training data
data_trn = data_unamed[trn_idx, ] # training data
data_tst = data_unamed[-trn_idx, ] # testing data

Modeling k

# full additive model
mod_k_full = lm(k ~ ., data = data_trn)
test_mod(mod_k_full, k = 1)
## Warning in predict.lm(model, newdata = data_tst): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(model, newdata = data_tst): prediction from a rank-
## deficient fit may be misleading
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##   0.221852   0.475015   0.016221   0.000191  29.000000   0.149099  14.731106
summary(mod_k_full)
## 
## Call:
## lm(formula = k ~ ., data = data_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4647 -0.0452  0.0073  0.0503  0.2077 
## 
## Coefficients: (9 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      1.62e+01   3.93e+00    4.13  0.00019 ***
## sigma                                  NA         NA      NA       NA    
## lamda                           -6.46e+00   1.20e+00   -5.41  3.5e-06 ***
## c                               -3.05e-01   7.17e-02   -4.26  0.00012 ***
## alpha                            3.08e+00   1.02e+00    3.00  0.00465 ** 
## omega                            2.32e+01   2.59e+01    0.89  0.37630    
## miu                              4.41e+00   1.99e+00    2.21  0.03272 *  
## cases                           -4.57e-07   1.64e-06   -0.28  0.78232    
## deaths                           1.32e-05   1.91e-05    0.69  0.49356    
## recovered                       -1.54e-06   6.60e-06   -0.23  0.81645    
## POP_LATITUDE                    -2.01e-01   5.03e-02   -3.99  0.00028 ***
## POP_LONGITUDE                    6.94e-02   1.69e-02    4.11  0.00019 ***
## PopulationEstimate2018          -1.13e-04   2.90e-05   -3.91  0.00036 ***
## PopTotalMale2017                 2.15e-04   5.58e-05    3.85  0.00043 ***
## PopulationEstimate_above65_2017  7.20e-05   1.76e-05    4.10  0.00020 ***
## PopulationDensityperSqMile2010   2.45e-05   1.41e-04    0.17  0.86330    
## DiabetesPercentage              -8.36e-01   2.00e-01   -4.19  0.00015 ***
## Smokers_Percentage               3.32e-01   8.49e-02    3.91  0.00036 ***
## HeartDiseaseMortality           -2.90e-02   9.27e-03   -3.13  0.00331 ** 
## StrokeMortality                  1.61e-01   4.45e-02    3.62  0.00083 ***
## Hospitals                              NA         NA      NA       NA    
## ICU_beds                               NA         NA      NA       NA    
## HospParticipatinginNetwork2017         NA         NA      NA       NA    
## stay_at_home                           NA         NA      NA       NA    
## above_50_gatherings                    NA         NA      NA       NA    
## above_500_gatherings                   NA         NA      NA       NA    
## restaurant_dine_in                     NA         NA      NA       NA    
## entertainment_gym                      NA         NA      NA       NA    
## days                             3.16e-03   1.46e-03    2.17  0.03613 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.136 on 39 degrees of freedom
## Multiple R-squared:  0.647,  Adjusted R-squared:  0.475 
## F-statistic: 3.76 on 19 and 39 DF,  p-value: 0.000227
# small additive model
mod_k_1 = lm(k ~ days, data = data_trn)
# large additive model
mod_k_2 = lm(k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - stay_at_home - above_50_gatherings - above_500_gatherings - restaurant_dine_in - entertainment_gym - sigma, data = data_trn)

test_mod(mod_k_1, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  1.867e-01  2.328e-02  9.083e-01  4.534e-13  2.000e+00  5.830e-02  5.580e+00
test_mod(mod_k_2, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##   0.221852   0.475015   0.016221   0.000191  20.000000   0.149099  14.731106
summary(mod_k_1)
## 
## Call:
## lm(formula = k ~ days, data = data_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2560 -0.0326  0.0395  0.0661  0.2381 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.805555   0.054607   14.75   <2e-16 ***
## days        0.001488   0.000964    1.54     0.13    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.186 on 57 degrees of freedom
## Multiple R-squared:  0.0401, Adjusted R-squared:  0.0233 
## F-statistic: 2.38 on 1 and 57 DF,  p-value: 0.128
summary(mod_k_2)
## 
## Call:
## lm(formula = k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - 
##     stay_at_home - above_50_gatherings - above_500_gatherings - 
##     restaurant_dine_in - entertainment_gym - sigma, data = data_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4647 -0.0452  0.0073  0.0503  0.2077 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      1.62e+01   3.93e+00    4.13  0.00019 ***
## lamda                           -6.46e+00   1.20e+00   -5.41  3.5e-06 ***
## c                               -3.05e-01   7.17e-02   -4.26  0.00012 ***
## alpha                            3.08e+00   1.02e+00    3.00  0.00465 ** 
## omega                            2.32e+01   2.59e+01    0.89  0.37630    
## miu                              4.41e+00   1.99e+00    2.21  0.03272 *  
## cases                           -4.57e-07   1.64e-06   -0.28  0.78232    
## deaths                           1.32e-05   1.91e-05    0.69  0.49356    
## recovered                       -1.54e-06   6.60e-06   -0.23  0.81645    
## POP_LATITUDE                    -2.01e-01   5.03e-02   -3.99  0.00028 ***
## POP_LONGITUDE                    6.94e-02   1.69e-02    4.11  0.00019 ***
## PopulationEstimate2018          -1.13e-04   2.90e-05   -3.91  0.00036 ***
## PopTotalMale2017                 2.15e-04   5.58e-05    3.85  0.00043 ***
## PopulationEstimate_above65_2017  7.20e-05   1.76e-05    4.10  0.00020 ***
## PopulationDensityperSqMile2010   2.45e-05   1.41e-04    0.17  0.86330    
## DiabetesPercentage              -8.36e-01   2.00e-01   -4.19  0.00015 ***
## Smokers_Percentage               3.32e-01   8.49e-02    3.91  0.00036 ***
## HeartDiseaseMortality           -2.90e-02   9.27e-03   -3.13  0.00331 ** 
## StrokeMortality                  1.61e-01   4.45e-02    3.62  0.00083 ***
## days                             3.16e-03   1.46e-03    2.17  0.03613 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.136 on 39 degrees of freedom
## Multiple R-squared:  0.647,  Adjusted R-squared:  0.475 
## F-statistic: 3.76 on 19 and 39 DF,  p-value: 0.000227
# intermediate model
mod_k_3 = lm(k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - stay_at_home - above_50_gatherings - above_500_gatherings - restaurant_dine_in - entertainment_gym - sigma
             - cases - deaths - omega - recovered - PopulationDensityperSqMile2010 
             + log(days) - days
             + stay_at_home - PopulationEstimate2018 - PopTotalMale2017
             + I(POP_LATITUDE ^ 2) +  + I(POP_LONGITUDE ^ 2), 
             data = data_trn)
mod_k_4 = lm(k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - stay_at_home - above_50_gatherings - above_500_gatherings - restaurant_dine_in - entertainment_gym - sigma
             - cases - deaths - omega - recovered - PopulationDensityperSqMile2010 + 
               log(days) - days, 
             data = data_trn)

test_mod(mod_k_1, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  1.867e-01  2.328e-02  9.083e-01  4.534e-13  2.000e+00  5.830e-02  5.580e+00
test_mod(mod_k_2, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##   0.221852   0.475015   0.016221   0.000191  20.000000   0.149099  14.731106
test_mod(mod_k_3, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  0.2054978  0.5247754  0.0017914  0.0001737 16.0000000  0.1552475 14.6026990
test_mod(mod_k_4, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  0.2042750  0.5311550  0.0009837  0.0004907 15.0000000  0.1546657 14.4151641
summary(mod_k_3)
## 
## Call:
## lm(formula = k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - 
##     stay_at_home - above_50_gatherings - above_500_gatherings - 
##     restaurant_dine_in - entertainment_gym - sigma - cases - 
##     deaths - omega - recovered - PopulationDensityperSqMile2010 + 
##     log(days) - days + stay_at_home - PopulationEstimate2018 - 
##     PopTotalMale2017 + I(POP_LATITUDE^2) + +I(POP_LONGITUDE^2), 
##     data = data_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4742 -0.0548  0.0072  0.0526  0.2431 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      1.13e+03   4.10e+02    2.75   0.0087 ** 
## lamda                           -6.79e+00   1.13e+00   -6.02  3.4e-07 ***
## c                               -3.12e-01   6.69e-02   -4.67  2.9e-05 ***
## alpha                            3.83e+00   8.45e-01    4.54  4.5e-05 ***
## miu                              4.58e+00   1.71e+00    2.69   0.0102 *  
## POP_LATITUDE                    -2.11e+01   7.52e+00   -2.80   0.0076 ** 
## POP_LONGITUDE                    1.51e+01   5.58e+00    2.70   0.0098 ** 
## PopulationEstimate_above65_2017 -1.64e-04   6.58e-05   -2.50   0.0163 *  
## DiabetesPercentage              -8.55e+00   3.03e+00   -2.82   0.0072 ** 
## Smokers_Percentage               4.53e+00   1.62e+00    2.80   0.0077 ** 
## HeartDiseaseMortality           -8.73e-02   2.73e-02   -3.20   0.0026 ** 
## StrokeMortality                  4.52e-01   1.34e-01    3.38   0.0016 ** 
## log(days)                        1.02e-01   2.99e-02    3.41   0.0014 ** 
## stay_at_home                     1.14e+00   4.42e-01    2.59   0.0129 *  
## I(POP_LATITUDE^2)                2.65e-01   9.49e-02    2.79   0.0078 ** 
## I(POP_LONGITUDE^2)               7.85e-02   2.91e-02    2.70   0.0100 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.13 on 43 degrees of freedom
## Multiple R-squared:  0.648,  Adjusted R-squared:  0.525 
## F-statistic: 5.27 on 15 and 43 DF,  p-value: 8.7e-06
summary(mod_k_4)
## 
## Call:
## lm(formula = k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - 
##     stay_at_home - above_50_gatherings - above_500_gatherings - 
##     restaurant_dine_in - entertainment_gym - sigma - cases - 
##     deaths - omega - recovered - PopulationDensityperSqMile2010 + 
##     log(days) - days, data = data_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4732 -0.0497 -0.0017  0.0607  0.2366 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     15.3989580  3.5627641    4.32  8.7e-05 ***
## lamda                           -6.8317626  1.1179898   -6.11  2.3e-07 ***
## c                               -0.3221741  0.0646726   -4.98  1.0e-05 ***
## alpha                            3.6975440  0.8116795    4.56  4.1e-05 ***
## miu                              4.9379779  1.6006925    3.08  0.00351 ** 
## POP_LATITUDE                    -0.1920080  0.0440854   -4.36  7.8e-05 ***
## POP_LONGITUDE                    0.0662907  0.0153167    4.33  8.5e-05 ***
## PopulationEstimate2018          -0.0001105  0.0000270   -4.09  0.00018 ***
## PopTotalMale2017                 0.0002107  0.0000518    4.06  0.00020 ***
## PopulationEstimate_above65_2017  0.0000669  0.0000154    4.35  8.1e-05 ***
## DiabetesPercentage              -0.7787347  0.1646073   -4.73  2.3e-05 ***
## Smokers_Percentage               0.3062246  0.0605033    5.06  7.9e-06 ***
## HeartDiseaseMortality           -0.0257963  0.0058214   -4.43  6.1e-05 ***
## StrokeMortality                  0.1439159  0.0288281    4.99  9.9e-06 ***
## log(days)                        0.0993978  0.0294363    3.38  0.00154 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.129 on 44 degrees of freedom
## Multiple R-squared:  0.644,  Adjusted R-squared:  0.531 
## F-statistic: 5.69 on 14 and 44 DF,  p-value: 4.16e-06
# intermediate model 
# relatively bad models
mod_k_5 = lm(k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - stay_at_home - above_50_gatherings - above_500_gatherings - restaurant_dine_in - entertainment_gym - sigma
             - (lamda + c + alpha + omega + miu), 
             data = data_trn)
mod_k_6 = lm(k ~ lamda + c + alpha + omega + miu, 
             data = data_trn)

test_mod(mod_k_1, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  1.867e-01  2.328e-02  9.083e-01  4.534e-13  2.000e+00  5.830e-02  5.580e+00
test_mod(mod_k_2, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##   0.221852   0.475015   0.016221   0.000191  20.000000   0.149099  14.731106
test_mod(mod_k_3, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  0.2054978  0.5247754  0.0017914  0.0001737 16.0000000  0.1552475 14.6026990
test_mod(mod_k_4, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  0.2042750  0.5311550  0.0009837  0.0004907 15.0000000  0.1546657 14.4151641
test_mod(mod_k_5, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  2.004e-01  1.101e-01  2.567e-01  2.605e-12  1.500e+01  9.543e-02  7.160e+00
test_mod(mod_k_6, k = 1)
## loocv_rmse     adj_r2 bp_pval.BP    sw_pval num_params  test_rmse   perc_err 
##  2.135e-01  8.328e-02  1.932e-02  7.076e-10  6.000e+00  5.237e-02  5.250e+00
summary(mod_k_5)
## 
## Call:
## lm(formula = k ~ . - Hospitals - ICU_beds - HospParticipatinginNetwork2017 - 
##     stay_at_home - above_50_gatherings - above_500_gatherings - 
##     restaurant_dine_in - entertainment_gym - sigma - (lamda + 
##     c + alpha + omega + miu), data = data_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9987 -0.0268  0.0054  0.0370  0.2963 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                      8.44e+00   4.21e+00    2.01    0.051 .
## cases                            5.03e-07   1.92e-06    0.26    0.795  
## deaths                           1.45e-05   2.31e-05    0.63    0.533  
## recovered                       -5.29e-06   7.18e-06   -0.74    0.465  
## POP_LATITUDE                    -9.83e-02   5.46e-02   -1.80    0.079 .
## POP_LONGITUDE                    3.81e-02   1.82e-02    2.10    0.042 *
## PopulationEstimate2018          -5.57e-05   2.94e-05   -1.90    0.064 .
## PopTotalMale2017                 1.06e-04   5.64e-05    1.88    0.067 .
## PopulationEstimate_above65_2017  3.67e-05   2.00e-05    1.83    0.073 .
## PopulationDensityperSqMile2010  -2.84e-05   1.52e-04   -0.19    0.852  
## DiabetesPercentage              -4.58e-01   2.28e-01   -2.01    0.051 .
## Smokers_Percentage               1.93e-01   9.88e-02    1.96    0.057 .
## HeartDiseaseMortality           -1.62e-02   1.09e-02   -1.48    0.147  
## StrokeMortality                  9.04e-02   5.25e-02    1.72    0.092 .
## days                             2.01e-03   1.60e-03    1.25    0.216  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.177 on 44 degrees of freedom
## Multiple R-squared:  0.325,  Adjusted R-squared:  0.11 
## F-statistic: 1.51 on 14 and 44 DF,  p-value: 0.147
summary(mod_k_6)
## 
## Call:
## lm(formula = k ~ lamda + c + alpha + omega + miu, data = data_trn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0289 -0.0445  0.0356  0.0886  0.2567 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.1002     0.0791   13.90   <2e-16 ***
## lamda        -2.7175     1.2218   -2.22    0.030 *  
## c            -0.1608     0.0772   -2.08    0.042 *  
## alpha         2.0142     1.1260    1.79    0.079 .  
## omega       -36.1277    24.4670   -1.48    0.146    
## miu           0.8043     1.0704    0.75    0.456    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.18 on 53 degrees of freedom
## Multiple R-squared:  0.162,  Adjusted R-squared:  0.0833 
## F-statistic: 2.05 on 5 and 53 DF,  p-value: 0.0859
mod_k = mod_k_5
diagnostics(mod_k, testit = FALSE)

Modeling sigma

# mod_sigma = 

Modeling lambda

# mod_lambda = 

Modeling c

# mod_c = 

Modeling alpha

# mod_alpha = 

Modeling omega

# mod_omega = 

Modeling miu

# mod_miu = 

Discussion

wait

  1. What we have achieved is a simple, interpretable, accessible model that anyone can get their hands on. It not only works towards assisting the prediction and prevention of pandemic, but also gives us clues on the relationship between variable social factors and the vulnerability of one place to the virus.

  2. This is our very first step to make our own contribution to the control of the pandemic. We believe that everyone can take a step to do their parts, and we need not be freaked out by the technological obstacles and the already-done achievements by the “professionals”.

  3. My partner and I are both undergraduates in non-CS majors, and this is our first time touching AWS or any other cloud service system. We started everything offline, accomplishing every tasks locally, and it did take us a while to figure out where to incorporate all those into AWS. But soon we see the enriched potentials and capability of AWS. Due to the limited time, we only took a slight bite of the AWS system, but we will be working to dig deeper.

  4. The next step to take is to integrate everything into an efficient cloud computing system so that larger data and more sophisticated models could be handled, especially in a well-automated manner.

  5. There are interesting stuffs like the political factor. We may wanna see how it affects We have learned a lot from this experience and we are looking forwards to becoming professional data analytist…blabla

Appendix

Helper Functions

get_bp_decision = function(model, alpha) {
  decide = unname(bptest(model)$p.value < alpha)
  ifelse(decide, "Reject", "Fail to Reject")
}

get_bp_pval = function(model) {
  bptest(model)$p.value
}

get_sw_decision = function(model, alpha) {
  decide = unname(shapiro.test(resid(model))$p.value < alpha)
  ifelse(decide, "Reject", "Fail to Reject")
}

get_sw_pval = function(model) {
  shapiro.test(resid(model))$p.value
}

get_num_params = function(model) {
  length(coef(model))
}

get_loocv_rmse = function(model, is_log) {
  ifelse(
    is_log, 
    sqrt(mean(na.omit(((dat_trn$price - exp(fitted(model))) / (1 - hatvalues(model))) ^ 2))),
    sqrt(mean((resid(model) / (1 - hatvalues(model))) ^ 2))
  )
}

get_adj_r2 = function(model) {
  summary(model)$adj.r.squared
}

test_mod = function(model, is_log = FALSE, k = 1){
  c(loocv_rmse = get_loocv_rmse(model, is_log), 
    adj_r2 = get_adj_r2(model), 
    bp_pval = get_bp_pval(model), 
    sw_pval = get_sw_pval(model), 
    num_params = get_num_params(model), 
    test_rmse = get_test_rmse(model, k), 
    perc_err = get_perc_err(model, k))
}

diagnostics = function(model, pcol = "grey", lcol = "dodgerblue", alpha = 0.05, plotit = TRUE, testit = TRUE){
  if (plotit){
    par(mfrow = c(1, 2), pty="s")
    
    plot(fitted(model), resid(model), col = "grey", pch = 20, 
         xlab = "Fitted", ylab = "Residual", 
         main = "Fitted versus Residuals")
    abline(h = 0, col = "darkorange", lwd = 2)
    
    qqnorm(resid(model), col = pcol)
    qqline(resid(model), col = lcol, lwd = 2)
  }
  if (testit){
    list(p_val = shapiro.test(resid(model))$p, 
         decision = ifelse(test = shapiro.test(resid(model))$p < alpha, 
                           yes = "Reject", no = "Fail to Reject"))
  }
}

get_test_rmse = function(model, k) {
  sqrt(mean((data_tst[, k] - predict(model, newdata = data_tst))^ 2))
}

get_perc_err = function(model, k) {
  actual = data_tst[, k]
  predicted = predict(model, newdata = data_tst)
  100 * mean((abs(actual - predicted)) / actual)
}